library(data.table)
library(ggplot2)
library(plyr)
library(magrittr)
library(readr)
library(stringr)
library(CellBarcode)
library(ggrepel)
library(ROCR)
library(ggsci)

theme0 <- theme_bw() + theme(
    text = element_text(size = 15),
    line = element_line(size = 1),
    axis.line = element_line(size = 1),
    axis.ticks.length = unit(3, units = "mm"),
    axis.text.x = element_text(
        margin = margin(t = 2, unit = "mm")
        , angle = 60, vjust = 1, size = 15, hjust = 1),
    axis.text.y = element_text(margin = margin(r = 3, l = 5, unit = "mm")),
    legend.position = "right",
) 
## Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0.
## ℹ Please use the `linewidth` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
theme0_lt = theme0 + theme(legend.position = "top")
theme1 = theme0 + theme(
    axis.text.x = element_text( 
        margin = margin(t = 2, unit = "mm")
        , angle = 0, vjust = 1, size = 12, hjust = 0.5)
)

knitr::opts_chunk$set(fig.width=20, fig.height=12) 

Sample info

sample_info = fread("../run_simulation_no_umi/non_umi_simu_design_matrix.tsv")
sample_info$simu_id %<>% as.character
setkey(sample_info, "simu_id")

i_level_simu_name = sample_info[order(as.integer(simu_id)), simu_name][c(1:26)]

Clone size disbribution

true_barcode_l = read_rds("../run_preprocess_simulation/tmp/non_umi_simulation_ref.rds")
d_true_barcode = true_barcode_l %>% rbindlist(idcol = "simu_file") 
x = d_true_barcode$simu_file %>% str_match("barcode_simulation_\\d+_simu_seq_(.*)") %>% extract(, c(2))
table(x, useNA = "always")
## x
##     1    10    11    12    13    14    15    16    17    18    19     2    20 
##  8999  8998  8999  8998  8999  8999  8998  9000  8997  8998  8863  4499  8854 
##    21    22    23    24    25    26    27    28     3     4     5     6     7 
##  8999  6861  6869  6855 11971 20058 17997 35978 17989 35981  8998  8999  9000 
##     8     9  <NA> 
##  8999  8996     0
d_true_barcode = cbind(d_true_barcode, simu_id = x)
# d_true_barcode[simu_name == "hamming" & grepl("simulation_19_", simu_file), .N, by = barcode_seq][N > 1]
# d_true_barcode[simu_name == "vdj_barcode" & grepl("simulation_1_", simu_file), .N, by = barcode_seq][N > 1]
# d_true_barcode[simu_name == "vdj_barcode"]
d_true_barcode$simu_name = sample_info[d_true_barcode$simu_id, simu_name]
table(d_true_barcode$simu_name, useNA = "always")
## 
##                                barcode_length_10 
##                                             8999 
##     barcode_size_mean_3_variation_1_clone_n_1200 
##                                            35978 
##      barcode_size_mean_3_variation_1_clone_n_600 
##                                            17997 
##                                             base 
##                                             8999 
##                                     clone_n_1200 
##                                            35981 
##                                      clone_n_150 
##                                             4499 
##                                      clone_n_600 
##                                            17989 
##                                         cycle_20 
##                                             8996 
##                                         cycle_40 
##                                             8998 
##                                   efficiency_0.5 
##                                             8999 
##                                   efficiency_0.9 
##                                             8998 
##                                       error_1e-5 
##                                             8999 
##                                       error_1e-7 
##                                             8999 
##                                          hamming 
##                                             8863 
##                         hamming_size_variation_3 
##                                             8854 
##                                 ngs_profile_MSv1 
##                                             8998 
##                                     pcr_read_100 
##                                             9000 
##                                    pcr_read_1000 
##                                             8997 
##                                      pcr_read_25 
##                                             8998 
##                                    size_mean_0.6 
##                                             8998 
##                                      size_mean_3 
##                                             8999 
##                                 size_variation_1 
##                                             9000 
##                                 size_variation_3 
##                                             8999 
##                                      vdj_barcode 
##                                             6861 
## vdj_barcode_size_mean_3_variation_1_clone_n_1200 
##                                            20058 
##  vdj_barcode_size_mean_3_variation_1_clone_n_600 
##                                            11971 
##                     vdj_barcode_size_variation_1 
##                                             6869 
##                     vdj_barcode_size_variation_3 
##                                             6855 
##                                             <NA> 
##                                                0
d_true_barcode = rename(d_true_barcode, c("seq" = "barcode_seq"))

## Barcode frequency
ggplot(d_true_barcode[, .(freq = sum(freq)), by = .(simu_file, barcode_seq, simu_name)]) + aes(x = simu_name, y = freq) +
    geom_boxplot() +
    # geom_jitter(alpha = 0.2) + 
    theme0 + scale_y_log10() +
    scale_x_discrete(limits = i_level_simu_name)
## Warning: Removed 53975 rows containing missing values (`stat_boxplot()`).

AUC

# NOTE: input
d_auc = fread("./tmp/table_auc_no_umi_reference.tsv")
d_count_true_barcode = fread("./tmp/table_count_true_barcode_reference.tsv")

count - true barcode

ggplot(d_count_true_barcode) + aes(x = simu_name, y = count, color = factor(is_true)) +
    geom_boxplot(alpha = 0.2) +
    theme0_lt + scale_y_log10() + scale_color_npg() +
    scale_x_discrete(limits = i_level_simu_name)
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 60505 rows containing missing values (`stat_boxplot()`).
## Warning: Removed 34 rows containing non-finite values (`stat_boxplot()`).

AUC versus simu conditions

ggplot(d_auc) + aes(x = simu_name, y = auc) + geom_boxplot() + theme0 +
    scale_x_discrete(limits = i_level_simu_name) +
    labs(y = "AUC") + lims(y = c(0, 1))
## Warning: Removed 60 rows containing missing values (`stat_boxplot()`).

ggplot(d_auc) + aes(x = simu_name, y = aucpr) + geom_boxplot() + theme0 +
    scale_x_discrete(limits = i_level_simu_name) +
    labs(y = "P-R AUC") + lims(y = c(0, 1))
## Warning: Removed 60 rows containing missing values (`stat_boxplot()`).

Resolusion index

d_count_true_barcode = fread("./tmp/table_count_true_barcode_reference.tsv")

# d_count_true_barcode[, count_cat := cut(clone_size, breaks = c(0, 1, 2, 3, 4, 5, 10, 20, Inf))]
# lapply(1:100, function(i) {
# d_count_true_barcode[, sum(count < i & is_true), ,by = .(clone_size, simu_file)]
# })

Apply the autothreshold strategy

# NOTE: input
d = fread("./tmp/table_no_umi_auto_threshold_predict_rate_reference.tsv")

# ggplot(d) + aes(x = tpr, y = fpr, label = simu_name) + geom_point() +
# geom_label_repel(alpha = 0.5)
d_plot_pr = melt(d, id.vars = "simu_name", measure.vars=c("pre", "rec"), variable.name = "pr", value.name = "value") 
d_plot_tf = melt(d, id.vars = "simu_name", measure.vars=c("tpr", "fpr"), variable.name = "pr", value.name = "value") 

ggplot(d_plot_pr) + aes(x = simu_name, y = value, fill = factor(pr)) + 
    geom_boxplot() + theme0 +
    scale_x_discrete(limits = i_level_simu_name)
## Warning: Removed 120 rows containing missing values (`stat_boxplot()`).

ggplot(d_plot_pr) + aes(x = simu_name, y = 1 - value) + 
    geom_boxplot() +
    theme0 + facet_grid(pr ~ .) +
    scale_x_discrete(limits = i_level_simu_name)
## Warning: Removed 120 rows containing missing values (`stat_boxplot()`).

ggplot(d_plot_tf) + aes(x = simu_name, y = value) + 
    geom_boxplot() +
    theme0 + facet_grid(pr ~ .) +
    scale_x_discrete(limits = i_level_simu_name)
## Warning: Removed 120 rows containing missing values (`stat_boxplot()`).

Apply the threshold of 0.0001 of left reads

# NOTE: input
d = fread("./tmp/table_no_umi_p0001_threshold_predict_rate_reference.tsv")

# ggplot(d) + aes(x = tpr, y = fpr, label = simu_name) + geom_point() +
# geom_label_repel(alpha = 0.5)
d_plot_pr = melt(d, id.vars = "simu_name", measure.vars=c("pre", "rec"), variable.name = "pr", value.name = "value") 
d_plot_tf = melt(d, id.vars = "simu_name", measure.vars=c("tpr", "fpr"), variable.name = "pr", value.name = "value") 

ggplot(d_plot_pr) + aes(x = simu_name, y = value, fill = factor(pr)) + 
    geom_boxplot() + theme0 +
    scale_x_discrete(limits = i_level_simu_name)
## Warning: Removed 120 rows containing missing values (`stat_boxplot()`).

ggplot(d_plot_pr) + aes(x = simu_name, y = value) + 
    geom_boxplot() +
    theme0 + facet_grid(pr ~ .) +
    scale_x_discrete(limits = i_level_simu_name)
## Warning: Removed 120 rows containing missing values (`stat_boxplot()`).

ggplot(d_plot_tf) + aes(x = simu_name, y = value) + 
    geom_boxplot() +
    theme0 + facet_grid(pr ~ .) +
    scale_x_discrete(limits = i_level_simu_name)
## Warning: Removed 120 rows containing missing values (`stat_boxplot()`).

Merge threshold

d1 = fread("./tmp/table_no_umi_p0001_threshold_predict_rate_reference.tsv")[, resolution := "res0.0001"]
d2 = fread("./tmp/table_no_umi_p00001_threshold_predict_rate_reference.tsv")[, resolution := "res0.00001"]

d = rbindlist(list(d1, d2))
d = melt(d, id.vars = c("simu_name", "resolution"), measure.vars=c("pre", "rec"), variable.name = "pr", value.name = "value") 
d$resolution %<>% factor(levels = c("res0.0001", "res0.00001"))

ggplot(d) + aes(x = simu_name, y = value, color = resolution) +
    geom_boxplot() + 
    theme0 + facet_grid(pr ~ .) + scale_color_npg() +
    scale_x_discrete(limits = i_level_simu_name)
## Warning: Removed 240 rows containing missing values (`stat_boxplot()`).